from Functions import *
from pandas import read_csv
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib as mpl

# Set matplotlib's "too many open figures" warning threshold to 50; the
# sections below create a figure for each regression they evaluate.
mpl.rcParams['figure.max_open_warning'] = 50
# Apply the project's plotting style sheet (resolved relative to the working
# directory unless matplotlib finds it in its style library).
plt.style.use("Figures.mplstyle")

#----------------------------------------------------------------------------------------------------------------------------------
#                             Loading the Data and Filter It
#----------------------------------------------------------------------------------------------------------------------------------

# Load the oxo data set. The names `data`, `training` and `testing` defined in
# this section are consumed by the regression sections further down.
data = read_csv("Oxos Data.csv")

#Here you can define additional parameters if it so pleases you
# Squared ΔG_PCET term for each substrate prefix present in the CSV, added in
# the same column order as before (DHA, Xth, CHD, Fl).
for columnPrefix in ("DHA", "Xth", "CHD", "Fl"):
	data[columnPrefix + " ΔG_PCET^2"] = data[columnPrefix + " ΔG_PCET"]*data[columnPrefix + " ΔG_PCET"]

#Uncomment to make the HBonding adjustment from Snelgrove et al.
#data["DHA PCET Barrier"] = data["DHA PCET Barrier"] - 1.4*8.3*data["Abraham Alpha*Beta"]

# Rows to drop entirely, by "Index" value or by "Metal" label.
excludedIndices = []
excludedMetals = []
# Boolean row mask: keep main-rate rows that are not explicitly excluded.
# (Named keepRows rather than `filter` so the builtin is not shadowed.)
keepRows = (
	(data["Main Rate"] == 1)
	& ~data["Index"].isin(excludedIndices)
	& ~data["Metal"].isin(excludedMetals)
)

data = data[keepRows]

# Rows flagged for training can additionally be withheld (moved to the test
# set) by "Index" value or by "Metal" label.
withheldIndices = []
withheldMetals = []
toTrain = (
	(data["Training"] == 1)
	& ~data["Index"].isin(withheldIndices)
	& ~data["Metal"].isin(withheldMetals)
)

# Everything that is not used for training becomes the test set.
training = data[toTrain]
testing = data[~toTrain]

for heading, subset in (("Training Set:", training), ("Testing Set:", testing)):
	print(heading)
	print(subset)
	print()


#----------------------------------------------------------------------------------------------------------------------------------
#                             Regressions on all Oxo Data
#----------------------------------------------------------------------------------------------------------------------------------

#Each "regression" has:
#0	A list of characteristics to include
#1	A string for the graph title
#2	A String for the table title (and .svg filename)
#3	An int for how many of the parameters at the end to F-test
#4	A boolean for whether to plot the test set or not
#An empty list [] may be inserted to pause for Enter between regressions.
regressions = [

	#Main text regressions and "For completeness" Regressions
	[["DHA ΔG_PCET"], "Regression with only ΔG_PCET", "ΔG_PCET Only", 1, True],
	[["DHA ΔG_PCET", "%BV Tot", "%BV Dev"], "Regression with ΔG_PCET and %BV Sterics", "DHA ΔG_PCET and %BV Sterics", 2, True],
	[["DHA ΔG_PCET", "IBO Spin O"], "Regression with ΔG_PCET and IBO spin Density", "DHA ΔG_PCET and Spin Density", 1, True],
	[["DHA ΔG_PCET", "Spin Excitation"], "Regression with ΔG_PCET and Spin Excitation", "DHA ΔG_PCET and Spin Excitation", 1, True],
	[["DHA ΔG_PCET", "DHA |η| (G)"], "Regression with ΔG_PCET and the Magnitude of the Asynchronicity", "DHA ΔG_PCET with only Abs(Eta)", 1, True],
	[["DHA ΔG_PCET", "DHA ΔG_PT", "DHA ΔG_ET"], "Regression with ΔG_PCET, ΔG_PT, and ΔG_ET", "DHA ΔG_PCET, ΔG_PT, and ΔG_ET", 2, True],
	[["DHA ΔG_PCET", "DHA ΔG_ET", "DHA ΔG_PT"], "Regression with ΔG_PCET, ΔG_PT, and ΔG_ET", "DHA ΔG_PCET, ΔG_ET, and ΔG_PT", 1, True],
	[["DHA ΔG_PCET", "DHA ΔG_PT", "DHA ΔG_ET"], "Regression with ΔG_PCET, ΔG_PT, and ΔG_ET", "DHA ΔG_PCET, ΔG_PT, and ΔG_ET", 1, True],
	[["DHA ΔG_PCET", "DHA ΔG_PCET^2"], "Regression with ΔG_PCET and ΔG_PCET^2", "DHA ΔG_PCET and ΔG_PCET^2", 1, True],
	[["DHA ΔE_PCET"], "Regression with only ΔE_PCET", "ΔE_PCET Only", 1, True],
	[["DHA ΔE_PCET", "DHA ΔE_PT", "DHA ΔE_ET"], "Regression with ΔE_PCET, ΔE_PT, and ΔE_ET", "DHA ΔE_PCET, ΔE_PT, and ΔE_ET", 2, True],

	#Regression to demonstrate that we seem to have temperature dependence covered
	[["DHA ΔG_PCET", "kT"], "Regression with ΔG_PCET and an Entropy Intercept", "DHA ΔG_PCET and Entropy", 1, True],

	#Additional Sterics Regressions
	[["DHA ΔG_PCET", "Height", "Max Angle"], "Regression with ΔG_PCET and Height, Max Angle", "DHA ΔG_PCET and Height, Max Angle", 2, True],

	#Reorganization Regressions
	[["DHA ΔG_PCET", "Oxo λ", "Hydroxide λ"], "Regression with ΔG_PCET and Reorganization", "DHA ΔG_PCET and  λs", 2, True],
	[["DHA ΔG_PCET", "Oxo Stretch", "Hydroxide Stretch"], "Regression with ΔG_PCET and Stretching Energies", "DHA ΔG_PCET and Stretching", 2, True],
	[["DHA ΔG_PCET", "ΔLength M-O", "Total ΔLength M-L"], "Regression with ΔG_PCET and ΔL(M,O) and ΔL(M,L)", "DHA ΔG_PCET and ΔLengths", 2, True],
	#NOTE(review): unlike the two entries after it, the next graph title does not mention |η| - confirm whether that is intended.
	[["DHA ΔG_PCET", "Oxo λ", "Hydroxide λ", "DHA |η| (G)"], "Regression with ΔG_PCET, Reorganization", "DHA ΔG_PCET and  λs, abs(η)", 3, True],
	[["DHA ΔG_PCET", "Oxo Stretch", "Hydroxide Stretch", "DHA |η| (G)"], "Regression with ΔG_PCET, Stretching Energies, |η|", "DHA ΔG_PCET and Stretching, abs(η)", 3, True],
	[["DHA ΔG_PCET", "ΔLength M-O", "Total ΔLength M-L", "DHA |η| (G)"], "Regression with ΔG_PCET, ΔL(M,O) and ΔL(M,L), |η|", "DHA ΔG_PCET and ΔLengths, abs(η)", 3, True],
]

# One entry per *evaluated* regression. shortNames is kept alongside the
# metric lists so the summary table stays aligned even when `regressions`
# contains blank pause entries.
shortNames = []
scores = []
errs = []
looScores = []
looErrs = []
CVErrs = []
Fs = []

maxShortName = 0

# Tick settings shared by every oxo plot (names come from Functions).
oxoTicks = dict(xTicksMajor=ticksMajorOxo, xTicksMinor=ticksMinorOxo, yTicksMajor=ticksMajorOxo, yTicksMinor=ticksMinorOxo)

for regression in regressions:
	if not regression:
		# Blank entry: wait for Enter, then move on without evaluating anything.
		input()
		continue
	xVals, title, shortName, nFTest, plotTest = regression[:5]
	svgfilename = None#shortName #Delete "None#" to save figures instead of showing them
	model, F, pval, tErr, cvErr, looPredictions = fitAndEvaluate(xVals, nFTest, training, "DHA PCET Barrier")
	# NOTE(review): both plots receive the same figName; when saving is
	# enabled the second call may overwrite the first - confirm in plotModel.
	plotModel(xVals, model, title, training, yVal = "DHA PCET Barrier", figName=svgfilename, **oxoTicks)
	if plotTest:
		plotModel(xVals, model, title, training, test=testing, yVal = "DHA PCET Barrier", figName=svgfilename, **oxoTicks)

	# Predict once and reuse the result for both in-sample metrics.
	observed = training["DHA PCET Barrier"]
	fitted = model.predict(training[xVals])
	shortNames.append(shortName)
	scores.append(r2_score(observed, fitted))
	errs.append(mean_squared_error(observed, fitted))
	looScores.append(r2_score(observed, looPredictions))
	looErrs.append(mean_squared_error(observed, looPredictions))
	CVErrs.append(cvErr.mean())
	# The "Significant?" column shows the p-value returned by fitAndEvaluate.
	Fs.append(pval)
	maxShortName = max(maxShortName, len(shortName))


print("                                   SUMMARY OF RESULTS ON DHA TRAINING SET           ")
print("\n"+(' '*maxShortName)+" \tR^2\tErr\tLOO R^2\tLOO Err\tCV Err\tSignificant?")
for row in zip(shortNames, scores, errs, looScores, looErrs, CVErrs, Fs):
	# Right-align the name to the widest one; metrics are tab-separated with two decimals.
	metricCells = ["{:.2f}".format(value) for value in row[1:-1]]
	print(row[0].rjust(maxShortName)+":\t"+"\t".join(metricCells)+"\t"+str(row[-1]))


input("\nHit Enter Twice to Continue")
input()


#----------------------------------------------------------------------------------------------------------------------------------
#                             Regressions on CoIIIO Data
#----------------------------------------------------------------------------------------------------------------------------------

# Keep only rows with a measured k2. The explicit copy makes the column
# assignments below operate on an independent frame instead of on a slice of
# the frame returned by read_csv (avoids pandas' chained-assignment warning).
CoData = read_csv("CoIII Oxo Data.csv")
CoData = CoData[CoData["k2"].notnull()].copy()

# Eyring-type conversion of k2 into a barrier: -RT*(ln(k2/T) - ln(kB/h)) with
# T = 296.15. NOTE(review): the literals look like R in cal/(mol K) and kB, h
# in eV units - confirm against the data source.
CoData["Apparent Barrier"] = -(296.15*1.987/1000)*(np.log(CoData["k2"].divide(296.15)) - np.log(8.617333e-5/4.135668e-15))
# NOTE(review): the constants in this correction are not explained anywhere in
# this file. reducedMass has the form m1*m2/(m1+m2) with m1 = 532.38 - confirm
# the remaining values and their units.
reducedMass = 532.38*CoData["Mass"]/(532.38+CoData["Mass"])
CoData["Effective Barrier"] = CoData["Apparent Barrier"] - 0.59 * (1.5*np.log(2*3.14159*reducedMass*(3.4232000e7)*0.59/((4.5563e-6)**2)) + np.log(1000/6.022e23) + 2.5)

# (ΔG_PT - ΔG_ET) scaled by 0.707, stored as magnitude and as signed value.
ptMinusEt = CoData["ΔG_PT"] - CoData["ΔG_ET"]
CoData["Eta Abs (kcal/mol)"] = np.abs(ptMinusEt)*0.707
CoData["Eta (kcal/mol)"] = ptMinusEt*0.707

#Tetraphenylpropene has notably more steric hindrance that messes with the fit,
#so it is held out of the fit and only passed to plotModel as the test set.
CoTest = CoData[(CoData["Substrate"] == "1,1,3,3-tetraphenylpropene")]
CoData = CoData[(CoData["Substrate"] != "1,1,3,3-tetraphenylpropene")]

# Correlation matrix of the steric descriptors in the main oxo data set.
print(data[["%BV Tot", "%BV Dev", "Height", "Max Angle", "Depth", "Min Angle"]].corr())

print("CoIIIO Data:")
print(CoData)
print()

#Each entry: [x columns, graph title, table title, number of trailing parameters to F-test]
regressions = [
	[["ΔG_PT"], "Regression with only ΔG_PT", "CoIII ΔG_PT only", 1],
	[["ΔG_PCET"], "Regression with only ΔG_PCET", "CoIII ΔG_PCET only", 1],
	[["ΔG_PT", "ΔG_PCET"], "Regression with both ΔG_PT and ΔG_PCET", "CoIII ΔG_PT and ΔG_PCET", 1],
	[["ΔG_PT", "ΔG_PCET", "%BV"], "Regression with ΔG_PT, ΔG_PCET, and %BV Sterics", "CoIII ΔG_PT, ΔG_PCET, and %BV Sterics", 1],
	#Title corrected: this model does not include ΔG_PCET.
	[["ΔG_PT", "ΔG_ET"], "Regression with both ΔG_PT and ΔG_ET", "CoIII ΔG_PT and ΔG_ET", 1],
	[["ΔG_PT", "ΔG_PCET", "ΔG_ET"], "Regression with both ΔG_PT, ΔG_PCET, and ΔG_ET", "CoIII ΔG_PT, ΔG_PCET, and ΔG_ET", 1]
]

# One entry per *evaluated* regression; shortNames keeps the summary table
# aligned even if `regressions` contains blank pause entries.
shortNames = []
scores = []
errs = []
looScores = []
looErrs = []
CVErrs = []
Fs = []

maxShortName = 0

# Tick settings shared by every CoIII plot (names come from Functions).
coTicks = dict(xTicksMajor=ticksMajorCoO, xTicksMinor=ticksMinorCoO, yTicksMajor=ticksMajorCoO, yTicksMinor=ticksMinorCoO)

for regression in regressions:
	if not regression:
		# Blank entry: wait for Enter, then move on without evaluating anything.
		input()
		continue
	xVals, title, shortName, nFTest = regression[:4]
	svgfilename = None#shortName
	model, F, pVal, tErr, cvErr, looPredictions = fitAndEvaluate(xVals, nFTest, CoData, yVal = "Effective Barrier")
	plotModel(xVals, model, title, CoData, yVal = "Effective Barrier", limits=[-2,6], figName = svgfilename, test=CoTest, **coTicks)

	# Predict once and reuse the result for both in-sample metrics.
	observed = CoData["Effective Barrier"]
	fitted = model.predict(CoData[xVals])
	shortNames.append(shortName)
	scores.append(r2_score(observed, fitted))
	errs.append(mean_squared_error(observed, fitted))
	looScores.append(r2_score(observed, looPredictions))
	looErrs.append(mean_squared_error(observed, looPredictions))
	CVErrs.append(cvErr.mean())
	# The "Significant?" column shows the p-value returned by fitAndEvaluate.
	Fs.append(pVal)
	maxShortName = max(maxShortName, len(shortName))

print("                                   SUMMARY OF RESULTS ON CoIII DATA SET           ")
print((' '*maxShortName)+" \tR^2\tErr\tLOO R^2\tLOO Err\tCV Err\tSignificant?")
for row in zip(shortNames, scores, errs, looScores, looErrs, CVErrs, Fs):
	# Right-align the name to the widest one; metrics are tab-separated with two decimals.
	metricCells = ["{:.2f}".format(value) for value in row[1:-1]]
	print(row[0].rjust(maxShortName)+":\t"+"\t".join(metricCells)+"\t"+str(row[-1]))

input("\nHit Enter Twice to Continue")
input()

#----------------------------------------------------------------------------------------------------------------------------------
#                             Regressions on RuIVO Data
#----------------------------------------------------------------------------------------------------------------------------------

# Load the RuIV data and split it on the "Training" flag column
# (1 = fit on it, 0 = only shown as the test set in the plots).
RuData = read_csv("RuIV Oxo Data.csv")
RuTrain = RuData[RuData["Training"] == 1]
RuTest = RuData[RuData["Training"] == 0]

#Each entry: [x columns, graph title, table title, number of trailing parameters to F-test]
regressions = [
	[["ΔG_PCET"], "Regression with only ΔG_PCET", "RuIV ΔG_PCET only", 1],
	[["ΔG_PCET", "ΔG_PT", "ΔG_ET"], "Regression with both ΔG_PT, ΔG_PCET, and ΔG_ET", "RuIV ΔG_PCET, ΔG_PT, and ΔG_ET", 2],
]

# One entry per *evaluated* regression; shortNames keeps the summary table
# aligned even if `regressions` contains blank pause entries.
shortNames = []
scores = []
errs = []
looScores = []
looErrs = []
CVErrs = []
Fs = []

maxShortName = 0

# Tick settings shared by every RuIV plot (names come from Functions).
ruTicks = dict(xTicksMajor=ticksMajorRuO, xTicksMinor=ticksMinorRuO, yTicksMajor=ticksMajorRuO, yTicksMinor=ticksMinorRuO)

for regression in regressions:
	if not regression:
		# Blank entry: wait for Enter, then move on without evaluating anything.
		input()
		continue
	xVals, title, shortName, nFTest = regression[:4]
	svgfilename = None#shortName
	model, F, pVal, tErr, cvErr, looPredictions = fitAndEvaluate(xVals, nFTest, RuTrain, yVal = "Sub PCET Barrier")
	plotModel(xVals, model, title, RuTrain, yVal = "Sub PCET Barrier", limits=[1,11], figName = svgfilename, test=RuTest, label="Training Substrates", **ruTicks)

	# Predict once and reuse the result for both in-sample metrics.
	observed = RuTrain["Sub PCET Barrier"]
	fitted = model.predict(RuTrain[xVals])
	shortNames.append(shortName)
	scores.append(r2_score(observed, fitted))
	errs.append(mean_squared_error(observed, fitted))
	looScores.append(r2_score(observed, looPredictions))
	looErrs.append(mean_squared_error(observed, looPredictions))
	CVErrs.append(cvErr.mean())
	# The "Significant?" column shows the p-value returned by fitAndEvaluate.
	Fs.append(pVal)
	maxShortName = max(maxShortName, len(shortName))

print("                                   SUMMARY OF RESULTS ON RuIV DATA SET           ")
print((' '*maxShortName)+" \tR^2\tErr\tLOO R^2\tLOO Err\tCV Err\tSignificant?")
for row in zip(shortNames, scores, errs, looScores, looErrs, CVErrs, Fs):
	# Right-align the name to the widest one; metrics are tab-separated with two decimals.
	metricCells = ["{:.2f}".format(value) for value in row[1:-1]]
	print(row[0].rjust(maxShortName)+":\t"+"\t".join(metricCells)+"\t"+str(row[-1]))

input("\nHit Enter Twice to Continue")
input()


#----------------------------------------------------------------------------------------------------------------------------------
#                             Regressions on Multiple Substrates
#----------------------------------------------------------------------------------------------------------------------------------

#Each entry: [x columns, graph title, table title, number of trailing parameters to F-test]
regressions = [
	[["Sub ΔG_PCET"], "Regression with only ΔG_PCET", "ΔG_PCET Only", 1],
	[["Sub ΔG_PCET", "%BV Tot", "%BV Dev"], "Regression with ΔG_PCET and %BV Sterics", "Sub ΔG_PCET and %BV Sterics", 2],
	[["Sub ΔG_PCET", "IBO Spin O"], "Regression with ΔG_PCET and IBO spin Density", "Sub ΔG_PCET and Spin Density", 1],
	[["Sub ΔG_PCET", "Spin Excitation"], "Regression with ΔG_PCET and Spin Excitation", "Sub ΔG_PCET and Spin Excitation", 1],
	[["Sub ΔG_PCET", "Sub |η| (G)"], "Regression with ΔG_PCET and Charge Transfer Energetics", "Sub ΔG_PCET with only Abs(Eta)", 1],
	[["Sub ΔG_PCET", "Sub ΔG_PT", "Sub ΔG_ET"], "Regression with ΔG_PCET, ΔG_PT, and ΔG_ET", "Sub ΔG_PCET, ΔG_PT, and ΔG_ET", 2],
]

# Rows left out of the multi-substrate fit, by "Index" value or "Metal" label.
noSubIndices = [26, 27]
noSubMetals = ["Ru"]
allSubFilter = (
	(data["Main Rate"]==1)
	& ~data["Index"].isin(noSubIndices)
	& ~data["Metal"].isin(noSubMetals)
)

# Columns handed to compressSubstrates: the two fixed ones plus every x column
# used by any regression above, each listed once in first-seen order.
allXVals = ["Index", "Sub PCET Barrier"]
for regression in regressions:
	if not regression:
		# Blank pause entries carry no columns.
		continue
	for xVal in regression[0]:
		if xVal not in allXVals:
			allXVals.append(xVal)

# subStarts is used below as consecutive row boundaries into allSubData, one
# [start, stop) range per substrate.
allSubData, subStarts = compressSubstrates(data[allSubFilter], allXVals)

print("Multiple Substrates:")
print(allSubData)
print()

# One entry per *evaluated* regression; shortNames keeps the summary table
# aligned even if `regressions` contains blank pause entries.
shortNames = []
scores = []
errs = []
looScores = []
looErrs = []
# Reset for parity with the other sections; nothing in this section fills them.
losoScores = []
losoErrs = []
CVErrs = []
Fs = []

maxShortName = 0

# Tick settings shared by every plot in this section (names come from Functions).
oxoTicks = dict(xTicksMajor=ticksMajorOxo, xTicksMinor=ticksMinorOxo, yTicksMajor=ticksMajorOxo, yTicksMinor=ticksMinorOxo)

for regression in regressions:

	if not regression:
		# Blank entry: wait for Enter, then move on without evaluating anything.
		input()
		continue
	xVals, title, shortName, nFTest = regression[:4]

	model, F, pVal, tErr, cvErr, looPredictions = fitAndEvaluate(xVals, nFTest, allSubData, yVal = "Sub PCET Barrier", grouping = "Index")

	# All substrates share one figure; each row range gets its own marker and label.
	fig, axs = plt.subplots(constrained_layout=True)
	for i, (start, stop) in enumerate(zip(subStarts[:-1], subStarts[1:])):
		plotModel(xVals, model, title, allSubData.iloc[start:stop,:], yVal = "Sub PCET Barrier", figAxs=[fig, axs], marker=subMarks[i], label=substrates[i], **oxoTicks)
	#fig.savefig(shortName+".svg", dpi=600, format='svg')

	# Predict once and reuse the result for both in-sample metrics.
	observed = allSubData["Sub PCET Barrier"]
	fitted = model.predict(allSubData[xVals])
	shortNames.append(shortName)
	scores.append(r2_score(observed, fitted))
	errs.append(mean_squared_error(observed, fitted))
	looScores.append(r2_score(observed, looPredictions))
	looErrs.append(mean_squared_error(observed, looPredictions))

	maxShortName = max(maxShortName, len(shortName))

print("                                   SUMMARY OF RESULTS ON DHA, CHD, XANTHENE, AND, FLOURENE           ")
print("\n"+(' '*maxShortName)+" \tR^2\tErr\tLOO R^2\tLOO Err")
for row in zip(shortNames, scores, errs, looScores, looErrs):
	# Right-align the name to the widest one; metrics are tab-separated with two decimals.
	metricCells = ["{:.2f}".format(value) for value in row[1:]]
	print(row[0].rjust(maxShortName)+":\t"+"\t".join(metricCells))


input("\nHit Enter Twice to Continue")
input()

#----------------------------------------------------------------------------------------------------------------------------------
#                             Getting Data for Main Text Figures
#----------------------------------------------------------------------------------------------------------------------------------



#Building and plotting the best model for all oxos
# The feature lists are named once so the fit and every predict call are
# guaranteed to use the same columns in the same order.
oxoXVals = ["DHA ΔG_PCET", "DHA ΔG_PT", "DHA ΔG_ET"]
model, F, pval, tErr, cvErr, looPredictions = fitAndEvaluate(oxoXVals, 3, training, yVal = "DHA PCET Barrier")

print("\nFit to all Oxos")
# Print "observed predicted" pairs, one per row, first for the training rows
# and then for the testing rows.
for heading, subset in (("\nTraining data and prediction", training), ("\nTesting data and prediction", testing)):
	print(heading)
	experimental = subset["DHA PCET Barrier"].values
	predicted = model.predict(subset[oxoXVals])
	for observed, fitted in zip(experimental, predicted):
		print(observed, fitted)

#Plotting CoIII data with BDFE and BDFE,pKa
print("CoIII Data, pKa Prediction, pKa/BDFE Prediction")
experimental = CoData["Effective Barrier"].values

# Only the fitted model (first return value) is needed from these two fits.
model = fitAndEvaluate(["ΔG_PT"], 1, CoData, yVal = "Effective Barrier")[0]
prediction_wo_BDFE = model.predict(CoData[["ΔG_PT"]])

model = fitAndEvaluate(["ΔG_PT", "ΔG_PCET"], 1, CoData, yVal = "Effective Barrier")[0]
print("Prediction with pKa and BDFE")
prediction_w_BDFE = model.predict(CoData[["ΔG_PT", "ΔG_PCET"]])

# Columns: observed, prediction from ΔG_PT only, prediction from ΔG_PT and ΔG_PCET.
for observed, withoutBDFE, withBDFE in zip(experimental, prediction_wo_BDFE, prediction_w_BDFE):
	print(observed, withoutBDFE, withBDFE)

#Now we fit on all the data, and print it per substrate

subXVals = ["Sub ΔG_PCET", "Sub ΔG_PT", "Sub ΔG_ET"]
model, F, pval, tErr, cvErr, looPredictions = fitAndEvaluate(subXVals, 3, allSubData, yVal = "Sub PCET Barrier")

# Consecutive entries of subStarts bound each substrate's rows in allSubData.
# The row loop uses its own variable names so it no longer reuses the
# substrate index i.
for i, (start, stop) in enumerate(zip(subStarts[:-1], subStarts[1:])):
	print("Data for", substrates[i], "Observed then Predicted")
	subRows = allSubData.iloc[start:stop,:]
	experimental = subRows["Sub PCET Barrier"].values
	predicted = model.predict(subRows[subXVals])
	for observed, fitted in zip(experimental, predicted):
		print(observed, fitted)


input("\nHit Enter Twice to Continue")
input()
